{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Open the wikipedia dataset\n",
    "data = pd.read_csv('../data/glove.6B.50d.txt', sep=' ', header=None, quoting=csv.QUOTE_NONE, nrows=10000)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the characters\n",
    "mask_characters = data.iloc[:,0].str.match('^[a-zA-Z]')  # Bool mask of only characters\n",
    "data_only_char = data[mask_characters]  # Apply mask on data\n",
    "df_3000 = data_only_char.iloc[:3000]  # Select only 3000 first data points\n",
    "df_3000.set_index(0, inplace=True)\n",
    "df_3000.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Do the same for bigger dimensions, and then save them\n",
    "\n",
    "def clean(filename):\n",
    "    data = pd.read_csv(filename, sep=' ', header=None, quoting=csv.QUOTE_NONE, nrows=10000)\n",
    "    mask_characters = data.iloc[:,0].str.match('^[a-zA-Z]', na=False)  # Bool mask of only characters\n",
    "    data_only_char = data[mask_characters]  # Apply mask on data\n",
    "    df_3000 = data_only_char.iloc[:3000]  # Select only 3000 first data points\n",
    "    df_3000.set_index(0, inplace=True)\n",
    "    \n",
    "    return df_3000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "twitter = clean('../data/glove.twitter.27B.200d.txt')\n",
    "print(twitter.shape)\n",
    "twitter.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for val in twitter.index:\n",
    "    print (val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/twitter_3000.csv', encoding = \"ISO-8859-1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "crawler = clean('../data/glove.42B.300d.txt')\n",
    "print(crawler.shape)\n",
    "crawler.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wikipedia = clean('../data/glove.6B.300d.txt')\n",
    "print(wikipedia.shape)\n",
    "wikipedia.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wikipedia.to_csv('../data/wikipedia_3000.csv')\n",
    "crawler.to_csv('../data/crawler_3000.csv')\n",
    "twitter.to_csv('../data/twitter_3000.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
